First, we install our packages and then, we load them;

install.packages("tidyverse", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/USER/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\USER\AppData\Local\Temp\RtmpCE4191\downloaded_packages
install.packages("Tmisc", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/USER/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'Tmisc' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\USER\AppData\Local\Temp\RtmpCE4191\downloaded_packages
install.packages("janitor", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/USER/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'janitor' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\USER\AppData\Local\Temp\RtmpCE4191\downloaded_packages
install.packages("lubridate", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/USER/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'lubridate' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'lubridate'
## Warning in file.copy(savedcopy, lib, recursive = TRUE):
## problem copying C:\Users\USER\AppData\Local\R\win-
## library\4.2\00LOCK\lubridate\libs\x64\lubridate.dll to C:
## \Users\USER\AppData\Local\R\win-library\4.2\lubridate\libs\x64\lubridate.dll:
## Permission denied
## Warning: restored 'lubridate'
## 
## The downloaded binary packages are in
##  C:\Users\USER\AppData\Local\Temp\RtmpCE4191\downloaded_packages
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.1
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(Tmisc)
## Warning: package 'Tmisc' was built under R version 4.2.1
library(janitor)
## Warning: package 'janitor' was built under R version 4.2.1
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.2.1
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)

Note the following

Data Source is: https://divvy-tripdata.s3.amazonaws.com/index.html Also see data license agreement here:https://ride.divvybikes.com/data-license-agreement

First, we set our directory though

getwd()
## [1] "C:/Users/USER/Documents/Downloads/Divvy_Trips_2019"
setwd( "C:/Users/USER/Documents/Downloads/Divvy_Trips_2019")

Now, we load our data

q1_2019 <- read_csv("Divvy_Trips_2019_Q1.csv")
## Rows: 365069 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): from_station_name, to_station_name, usertype, gender
## dbl  (5): trip_id, bikeid, from_station_id, to_station_id, birthyear
## dttm (2): start_time, end_time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q2_2019 <- read_csv("Divvy_Trips_2019_Q2.csv")
## Rows: 1108163 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): 03 - Rental Start Station Name, 02 - Rental End Station Name, User...
## dbl  (5): 01 - Rental Details Rental ID, 01 - Rental Details Bike ID, 03 - R...
## dttm (2): 01 - Rental Details Local Start Time, 01 - Rental Details Local En...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q3_2019 <- read_csv("Divvy_Trips_2019_Q3.csv")
## Rows: 1640718 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): from_station_name, to_station_name, usertype, gender
## dbl  (5): trip_id, bikeid, from_station_id, to_station_id, birthyear
## dttm (2): start_time, end_time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q4_2019 <- read_csv("Divvy_Trips_2019_Q4.csv")
## Rows: 704054 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): from_station_name, to_station_name, usertype, gender
## dbl  (5): trip_id, bikeid, from_station_id, to_station_id, birthyear
## dttm (2): start_time, end_time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Comparing Column names

colnames(q1_2019)
##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"
colnames(q2_2019)
##  [1] "01 - Rental Details Rental ID"                   
##  [2] "01 - Rental Details Local Start Time"            
##  [3] "01 - Rental Details Local End Time"              
##  [4] "01 - Rental Details Bike ID"                     
##  [5] "01 - Rental Details Duration In Seconds Uncapped"
##  [6] "03 - Rental Start Station ID"                    
##  [7] "03 - Rental Start Station Name"                  
##  [8] "02 - Rental End Station ID"                      
##  [9] "02 - Rental End Station Name"                    
## [10] "User Type"                                       
## [11] "Member Gender"                                   
## [12] "05 - Member Details Member Birthday Year"
colnames(q3_2019)
##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"
colnames(q4_2019)
##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"

There are inconsistencies in Column names, so now we rename columns to gain uniformity

(q1_2019 <- rename(q1_2019
                   ,ride_id = trip_id
                   ,rideable_type = bikeid 
                   ,started_at = start_time  
                   ,ended_at = end_time  
                   ,start_station_name = from_station_name 
                   ,start_station_id = from_station_id 
                   ,end_station_name = to_station_name 
                   ,end_station_id = to_station_id 
                   ,member_casual = usertype))
(q2_2019 <- rename(q2_2019
                   ,ride_id = "01 - Rental Details Rental ID"
                   ,rideable_type = "01 - Rental Details Bike ID" 
                   ,started_at = "01 - Rental Details Local Start Time"  
                   ,ended_at = "01 - Rental Details Local End Time"  
                   ,start_station_name = "03 - Rental Start Station Name" 
                   ,start_station_id = "03 - Rental Start Station ID"
                   ,end_station_name = "02 - Rental End Station Name" 
                   ,end_station_id = "02 - Rental End Station ID"
                   ,member_casual = "User Type"))
(q3_2019 <- rename(q3_2019
                   ,ride_id = trip_id
                   ,rideable_type = bikeid 
                   ,started_at = start_time  
                   ,ended_at = end_time  
                   ,start_station_name = from_station_name 
                   ,start_station_id = from_station_id 
                   ,end_station_name = to_station_name 
                   ,end_station_id = to_station_id 
                   ,member_casual = usertype))
(q4_2019 <- rename(q4_2019
                   ,ride_id = trip_id
                   ,rideable_type = bikeid 
                   ,started_at = start_time  
                   ,ended_at = end_time  
                   ,start_station_name = from_station_name 
                   ,start_station_id = from_station_id 
                   ,end_station_name = to_station_name 
                   ,end_station_id = to_station_id 
                   ,member_casual = usertype))

Let’s inspect the data frames

str(q1_2019)
## spec_tbl_df [365,069 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : num [1:365069] 21742443 21742444 21742445 21742446 21742447 ...
##  $ started_at        : POSIXct[1:365069], format: "2019-01-01 00:04:37" "2019-01-01 00:08:13" ...
##  $ ended_at          : POSIXct[1:365069], format: "2019-01-01 00:11:07" "2019-01-01 00:15:34" ...
##  $ rideable_type     : num [1:365069] 2167 4386 1524 252 1170 ...
##  $ tripduration      : num [1:365069] 390 441 829 1783 364 ...
##  $ start_station_id  : num [1:365069] 199 44 15 123 173 98 98 211 150 268 ...
##  $ start_station_name: chr [1:365069] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
##  $ end_station_id    : num [1:365069] 84 624 644 176 35 49 49 142 148 141 ...
##  $ end_station_name  : chr [1:365069] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
##  $ member_casual     : chr [1:365069] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr [1:365069] "Male" "Female" "Female" "Male" ...
##  $ birthyear         : num [1:365069] 1989 1990 1994 1993 1994 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   trip_id = col_double(),
##   ..   start_time = col_datetime(format = ""),
##   ..   end_time = col_datetime(format = ""),
##   ..   bikeid = col_double(),
##   ..   tripduration = col_number(),
##   ..   from_station_id = col_double(),
##   ..   from_station_name = col_character(),
##   ..   to_station_id = col_double(),
##   ..   to_station_name = col_character(),
##   ..   usertype = col_character(),
##   ..   gender = col_character(),
##   ..   birthyear = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
str(q4_2019)
## spec_tbl_df [704,054 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : num [1:704054] 25223640 25223641 25223642 25223643 25223644 ...
##  $ started_at        : POSIXct[1:704054], format: "2019-10-01 00:01:39" "2019-10-01 00:02:16" ...
##  $ ended_at          : POSIXct[1:704054], format: "2019-10-01 00:17:20" "2019-10-01 00:06:34" ...
##  $ rideable_type     : num [1:704054] 2215 6328 3003 3275 5294 ...
##  $ tripduration      : num [1:704054] 940 258 850 2350 1867 ...
##  $ start_station_id  : num [1:704054] 20 19 84 313 210 156 84 156 156 336 ...
##  $ start_station_name: chr [1:704054] "Sheffield Ave & Kingsbury St" "Throop (Loomis) St & Taylor St" "Milwaukee Ave & Grand Ave" "Lakeview Ave & Fullerton Pkwy" ...
##  $ end_station_id    : num [1:704054] 309 241 199 290 382 226 142 463 463 336 ...
##  $ end_station_name  : chr [1:704054] "Leavitt St & Armitage Ave" "Morgan St & Polk St" "Wabash Ave & Grand Ave" "Kedzie Ave & Palmer Ct" ...
##  $ member_casual     : chr [1:704054] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr [1:704054] "Male" "Male" "Female" "Male" ...
##  $ birthyear         : num [1:704054] 1987 1998 1991 1990 1987 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   trip_id = col_double(),
##   ..   start_time = col_datetime(format = ""),
##   ..   end_time = col_datetime(format = ""),
##   ..   bikeid = col_double(),
##   ..   tripduration = col_number(),
##   ..   from_station_id = col_double(),
##   ..   from_station_name = col_character(),
##   ..   to_station_id = col_double(),
##   ..   to_station_name = col_character(),
##   ..   usertype = col_character(),
##   ..   gender = col_character(),
##   ..   birthyear = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
str(q3_2019)
## spec_tbl_df [1,640,718 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : num [1:1640718] 23479388 23479389 23479390 23479391 23479392 ...
##  $ started_at        : POSIXct[1:1640718], format: "2019-07-01 00:00:27" "2019-07-01 00:01:16" ...
##  $ ended_at          : POSIXct[1:1640718], format: "2019-07-01 00:20:41" "2019-07-01 00:18:44" ...
##  $ rideable_type     : num [1:1640718] 3591 5353 6180 5540 6014 ...
##  $ tripduration      : num [1:1640718] 1214 1048 1554 1503 1213 ...
##  $ start_station_id  : num [1:1640718] 117 381 313 313 168 300 168 313 43 43 ...
##  $ start_station_name: chr [1:1640718] "Wilton Ave & Belmont Ave" "Western Ave & Monroe St" "Lakeview Ave & Fullerton Pkwy" "Lakeview Ave & Fullerton Pkwy" ...
##  $ end_station_id    : num [1:1640718] 497 203 144 144 62 232 62 144 195 195 ...
##  $ end_station_name  : chr [1:1640718] "Kimball Ave & Belmont Ave" "Western Ave & 21st St" "Larrabee St & Webster Ave" "Larrabee St & Webster Ave" ...
##  $ member_casual     : chr [1:1640718] "Subscriber" "Customer" "Customer" "Customer" ...
##  $ gender            : chr [1:1640718] "Male" NA NA NA ...
##  $ birthyear         : num [1:1640718] 1992 NA NA NA NA ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   trip_id = col_double(),
##   ..   start_time = col_datetime(format = ""),
##   ..   end_time = col_datetime(format = ""),
##   ..   bikeid = col_double(),
##   ..   tripduration = col_number(),
##   ..   from_station_id = col_double(),
##   ..   from_station_name = col_character(),
##   ..   to_station_id = col_double(),
##   ..   to_station_name = col_character(),
##   ..   usertype = col_character(),
##   ..   gender = col_character(),
##   ..   birthyear = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
str(q2_2019)
## spec_tbl_df [1,108,163 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id                                         : num [1:1108163] 22178529 22178530 22178531 22178532 22178533 ...
##  $ started_at                                      : POSIXct[1:1108163], format: "2019-04-01 00:02:22" "2019-04-01 00:03:02" ...
##  $ ended_at                                        : POSIXct[1:1108163], format: "2019-04-01 00:09:48" "2019-04-01 00:20:30" ...
##  $ rideable_type                                   : num [1:1108163] 6251 6226 5649 4151 3270 ...
##  $ 01 - Rental Details Duration In Seconds Uncapped: num [1:1108163] 446 1048 252 357 1007 ...
##  $ start_station_id                                : num [1:1108163] 81 317 283 26 202 420 503 260 211 211 ...
##  $ start_station_name                              : chr [1:1108163] "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
##  $ end_station_id                                  : num [1:1108163] 56 59 174 133 129 426 500 499 211 211 ...
##  $ end_station_name                                : chr [1:1108163] "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
##  $ member_casual                                   : chr [1:1108163] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ Member Gender                                   : chr [1:1108163] "Male" "Female" "Male" "Male" ...
##  $ 05 - Member Details Member Birthday Year        : num [1:1108163] 1975 1984 1990 1993 1992 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `01 - Rental Details Rental ID` = col_double(),
##   ..   `01 - Rental Details Local Start Time` = col_datetime(format = ""),
##   ..   `01 - Rental Details Local End Time` = col_datetime(format = ""),
##   ..   `01 - Rental Details Bike ID` = col_double(),
##   ..   `01 - Rental Details Duration In Seconds Uncapped` = col_number(),
##   ..   `03 - Rental Start Station ID` = col_double(),
##   ..   `03 - Rental Start Station Name` = col_character(),
##   ..   `02 - Rental End Station ID` = col_double(),
##   ..   `02 - Rental End Station Name` = col_character(),
##   ..   `User Type` = col_character(),
##   ..   `Member Gender` = col_character(),
##   ..   `05 - Member Details Member Birthday Year` = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Convert ride_id and rideable_type to character

q1_2019 <-  mutate(q1_2019, ride_id = as.character(ride_id)
                   ,rideable_type = as.character(rideable_type))
q2_2019 <-  mutate(q2_2019, ride_id = as.character(ride_id)
                   ,rideable_type = as.character(rideable_type))
q3_2019 <-  mutate(q3_2019, ride_id = as.character(ride_id)
                   ,rideable_type = as.character(rideable_type)) 
q4_2019 <-  mutate(q4_2019, ride_id = as.character(ride_id)
                   ,rideable_type = as.character(rideable_type))

Combine data frames into one big data frame

all_trips <- bind_rows(q1_2019, q2_2019, q3_2019, q4_2019)

Let’s clean up a little

First, we’ll inspect

colnames(all_trips)
##  [1] "ride_id"                                         
##  [2] "started_at"                                      
##  [3] "ended_at"                                        
##  [4] "rideable_type"                                   
##  [5] "tripduration"                                    
##  [6] "start_station_id"                                
##  [7] "start_station_name"                              
##  [8] "end_station_id"                                  
##  [9] "end_station_name"                                
## [10] "member_casual"                                   
## [11] "gender"                                          
## [12] "birthyear"                                       
## [13] "01 - Rental Details Duration In Seconds Uncapped"
## [14] "Member Gender"                                   
## [15] "05 - Member Details Member Birthday Year"
nrow(all_trips)
## [1] 3818004
head(all_trips)
dim(all_trips)
## [1] 3818004      15
str(all_trips)  
## tibble [3,818,004 × 15] (S3: tbl_df/tbl/data.frame)
##  $ ride_id                                         : chr [1:3818004] "21742443" "21742444" "21742445" "21742446" ...
##  $ started_at                                      : POSIXct[1:3818004], format: "2019-01-01 00:04:37" "2019-01-01 00:08:13" ...
##  $ ended_at                                        : POSIXct[1:3818004], format: "2019-01-01 00:11:07" "2019-01-01 00:15:34" ...
##  $ rideable_type                                   : chr [1:3818004] "2167" "4386" "1524" "252" ...
##  $ tripduration                                    : num [1:3818004] 390 441 829 1783 364 ...
##  $ start_station_id                                : num [1:3818004] 199 44 15 123 173 98 98 211 150 268 ...
##  $ start_station_name                              : chr [1:3818004] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
##  $ end_station_id                                  : num [1:3818004] 84 624 644 176 35 49 49 142 148 141 ...
##  $ end_station_name                                : chr [1:3818004] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
##  $ member_casual                                   : chr [1:3818004] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender                                          : chr [1:3818004] "Male" "Female" "Female" "Male" ...
##  $ birthyear                                       : num [1:3818004] 1989 1990 1994 1993 1994 ...
##  $ 01 - Rental Details Duration In Seconds Uncapped: num [1:3818004] NA NA NA NA NA NA NA NA NA NA ...
##  $ Member Gender                                   : chr [1:3818004] NA NA NA NA ...
##  $ 05 - Member Details Member Birthday Year        : num [1:3818004] NA NA NA NA NA NA NA NA NA NA ...
summary(all_trips)
##    ride_id            started_at                    
##  Length:3818004     Min.   :2019-01-01 00:04:37.00  
##  Class :character   1st Qu.:2019-05-29 15:49:26.50  
##  Mode  :character   Median :2019-07-25 17:50:54.00  
##                     Mean   :2019-07-19 21:47:37.11  
##                     3rd Qu.:2019-09-15 06:48:05.75  
##                     Max.   :2019-12-31 23:57:17.00  
##                                                     
##     ended_at                      rideable_type       tripduration     
##  Min.   :2019-01-01 00:11:07.00   Length:3818004     Min.   :      61  
##  1st Qu.:2019-05-29 16:09:28.25   Class :character   1st Qu.:     405  
##  Median :2019-07-25 18:12:23.00   Mode  :character   Median :     696  
##  Mean   :2019-07-19 22:11:47.56                      Mean   :    1500  
##  3rd Qu.:2019-09-15 08:30:13.25                      3rd Qu.:    1257  
##  Max.   :2020-01-21 13:54:35.00                      Max.   :10628400  
##                                                      NA's   :1108163   
##  start_station_id start_station_name end_station_id  end_station_name  
##  Min.   :  1.0    Length:3818004     Min.   :  1.0   Length:3818004    
##  1st Qu.: 77.0    Class :character   1st Qu.: 77.0   Class :character  
##  Median :174.0    Mode  :character   Median :174.0   Mode  :character  
##  Mean   :201.7                       Mean   :202.6                     
##  3rd Qu.:289.0                       3rd Qu.:291.0                     
##  Max.   :673.0                       Max.   :673.0                     
##                                                                        
##  member_casual         gender            birthyear      
##  Length:3818004     Length:3818004     Min.   :1888     
##  Class :character   Class :character   1st Qu.:1979     
##  Mode  :character   Mode  :character   Median :1987     
##                                        Mean   :1984     
##                                        3rd Qu.:1992     
##                                        Max.   :2003     
##                                        NA's   :1465961  
##  01 - Rental Details Duration In Seconds Uncapped Member Gender     
##  Min.   :     61                                  Length:3818004    
##  1st Qu.:    426                                  Class :character  
##  Median :    742                                  Mode  :character  
##  Mean   :   1327                                                    
##  3rd Qu.:   1347                                                    
##  Max.   :4757640                                                    
##  NA's   :2709841                                                    
##  05 - Member Details Member Birthday Year
##  Min.   :1759                            
##  1st Qu.:1979                            
##  Median :1987                            
##  Mean   :1984                            
##  3rd Qu.:1992                            
##  Max.   :2014                            
##  NA's   :2890794

We will reassign some values

all_trips <-  all_trips %>% 
  mutate(member_casual = recode(member_casual,"Subscriber" = "member","Customer" = "casual"))
table(all_trips$member_casual)
## 
##  casual  member 
##  880637 2937367

Aggregating our data next…

all_trips$date <- as.Date(all_trips$started_at) 
all_trips$month <- format(as.Date(all_trips$date), "%m")
all_trips$day <- format(as.Date(all_trips$date), "%d")
all_trips$year <- format(as.Date(all_trips$date), "%y")
all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")

Calculating ride length

all_trips$ride_length <- difftime(all_trips$ended_at,all_trips$started_at)

str(all_trips)
## tibble [3,818,004 × 21] (S3: tbl_df/tbl/data.frame)
##  $ ride_id                                         : chr [1:3818004] "21742443" "21742444" "21742445" "21742446" ...
##  $ started_at                                      : POSIXct[1:3818004], format: "2019-01-01 00:04:37" "2019-01-01 00:08:13" ...
##  $ ended_at                                        : POSIXct[1:3818004], format: "2019-01-01 00:11:07" "2019-01-01 00:15:34" ...
##  $ rideable_type                                   : chr [1:3818004] "2167" "4386" "1524" "252" ...
##  $ tripduration                                    : num [1:3818004] 390 441 829 1783 364 ...
##  $ start_station_id                                : num [1:3818004] 199 44 15 123 173 98 98 211 150 268 ...
##  $ start_station_name                              : chr [1:3818004] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
##  $ end_station_id                                  : num [1:3818004] 84 624 644 176 35 49 49 142 148 141 ...
##  $ end_station_name                                : chr [1:3818004] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
##  $ member_casual                                   : chr [1:3818004] "member" "member" "member" "member" ...
##  $ gender                                          : chr [1:3818004] "Male" "Female" "Female" "Male" ...
##  $ birthyear                                       : num [1:3818004] 1989 1990 1994 1993 1994 ...
##  $ 01 - Rental Details Duration In Seconds Uncapped: num [1:3818004] NA NA NA NA NA NA NA NA NA NA ...
##  $ Member Gender                                   : chr [1:3818004] NA NA NA NA ...
##  $ 05 - Member Details Member Birthday Year        : num [1:3818004] NA NA NA NA NA NA NA NA NA NA ...
##  $ date                                            : Date[1:3818004], format: "2019-01-01" "2019-01-01" ...
##  $ month                                           : chr [1:3818004] "01" "01" "01" "01" ...
##  $ day                                             : chr [1:3818004] "01" "01" "01" "01" ...
##  $ year                                            : chr [1:3818004] "19" "19" "19" "19" ...
##  $ day_of_week                                     : chr [1:3818004] "Tuesday" "Tuesday" "Tuesday" "Tuesday" ...
##  $ ride_length                                     : 'difftime' num [1:3818004] 6.5 7.35 13.8166666666667 29.7166666666667 ...
##   ..- attr(*, "units")= chr "mins"

Our ride length is currently a factor,we’ll convert it to numeric

is.factor(all_trips$ride_length)
## [1] FALSE
all_trips$ride_length <- as.numeric(as.character(all_trips$ride_length))
is.numeric(all_trips$ride_length)
## [1] TRUE

We need to remove our bad data, so we will create a second version of our dtaframe.We are doing this to ensure that our analysis is as accurate as it can be while also ensuring that we acknowledge the changes we’ve made to the data

all_trips_v2 <- all_trips[!(all_trips$start_station_name == "HQ QR" | all_trips$ride_length<0),]

Let’s make some more analysis

summary(all_trips_v2$ride_length)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##      1.02      6.85     11.82     24.17     21.40 177200.37

we will now compare members and casual cyclistic users

aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = mean)
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = median)
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = max)
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = min)

Average ride time per day

aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)

Let’s put that in order

all_trips_v2$day_of_week <- ordered(all_trips_v2$day_of_week, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))

Now,average ride time by each day for members vs casual users

aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)

We will now analyse by user type(member/casual) and weekday

all_trips_v2 %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>%  
  group_by(member_casual, weekday) %>%  
  summarise(number_of_rides = n()                        
            ,average_duration = mean(ride_length)) %>%      
  arrange(member_casual, weekday)                           
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

Now for some visualization

number of rides(weekday vs number of rides):

all_trips_v2 %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>% 
  group_by(member_casual, weekday) %>% 
  summarise(number_of_rides = n()
            ,average_duration = mean(ride_length)) %>% 
  arrange(member_casual, weekday)  %>% 
  ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge") +
  labs(title = "Number of Rides vs Users" )
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

Duration of rides(weekday vs average duration)

all_trips_v2 %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>% 
  group_by(member_casual, weekday) %>% 
  summarise(number_of_rides = n()
            ,average_duration = mean(ride_length)) %>% 
  arrange(member_casual, weekday)  %>% 
  ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
  geom_col(position = "dodge")+
  labs(title = "Ride Length vs Users")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

Let’s save a summary file now, for future analysis Link:https://docs.google.com/spreadsheets/d/1NghhwxNyMhkrzvcg7GT55eAA9FziDNU6i46y89avb-0/edit?usp=sharing

counts <- aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
write.csv(counts, file = 'C:/Users/USER/Documents/avg_ride_length.csv')